PC Open 100

home *** CD-ROM | disk | FTP | other *** search

/ PC Open 100 / PC Open 100 CD 1.bin / CD1 / INTERNET / EMAIL / pop file / setup.exe / Classifier / Bayes.pm next >

Wrap

Perl POD Document | 2004-03-16 | 122.3 KB | 3,497 lines

# POPFILE LOADABLE MODULE package Classifier::Bayes; use POPFile::Module; @ISA = ("POPFile::Module"); # --------------------------------------------------------------------------------------------- # # Bayes.pm --- Naive Bayes text classifier # # Copyright (c) 2001-2003 John Graham-Cumming # # This file is part of POPFile # # POPFile is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # POPFile is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with POPFile; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Modified by Sam Schinke (sschinke@users.sourceforge.net) # Merged with db code from Scott Leighton (helphand@users.sourceforge.net) # # --------------------------------------------------------------------------------------------- use strict; use warnings; use locale; use Classifier::MailParse; use IO::Handle; use DBI; use Digest::MD5 qw( md5_hex ); # This is used to get the hostname of the current machine # in a cross platform way use Sys::Hostname; # A handy variable containing the value of an EOL for networks my $eol = "\015\012"; # Korean characters definition my $ksc5601_sym = '(?:[\xA1-\xAC][\xA1-\xFE])'; my $ksc5601_han = '(?:[\xB0-\xC8][\xA1-\xFE])'; my $ksc5601_hanja = '(?:[\xCA-\xFD][\xA1-\xFE])'; my $ksc5601 = "(?:$ksc5601_sym|$ksc5601_han|$ksc5601_hanja)"; my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc #---------------------------------------------------------------------------- # new # # Class new() function #---------------------------------------------------------------------------- sub new { my $type = shift; my $self = POPFile::Module->new(); # Set this to 1 to get scores for individual words in message detail $self->{wordscores__} = 0; # Choice for the format of the "word matrix" display. $self->{wmformat__} = ''; # Just our hostname $self->{hostname__} = ''; # File Handle for DBI database $self->{db__} = {}; # To save time we also 'prepare' some commonly used SQL statements and cache # them here, see the function db_connect__ for details $self->{db_get_buckets__} = 0; $self->{db_get_wordid__} = 0; $self->{db_get_word_count__} = 0; $self->{db_put_word_count__} = 0; $self->{db_get_bucket_unique_counts__} = 0; $self->{db_get_unique_word_count__} = 0; $self->{db_get_bucket_word_counts__} = 0; $self->{db_get_full_total__} = 0; $self->{db_get_bucket_parameter__} = 0; $self->{db_set_bucket_parameter__} = 0; $self->{db_get_bucket_parameter_default__} = 0; $self->{db_get_buckets_with_magnets__} = 0; $self->{db_delete_zero_words__} = 0; # Caches the name of each bucket and relates it to both the bucket ID in the # database and whether it is pseudo or not # # Subkeys used are: # # id The bucket ID in the database # pseudo 1 if this is a pseudo bucket $self->{db_bucketid__} = {}; # Caches the IDs that map to parameter types $self->{db_parameterid__} = {}; # Caches looked up parameter values on a per bucket basis $self->{db_parameters__} = {}; # Used to parse mail messages $self->{parser__} = new Classifier::MailParse; # The possible colors for buckets $self->{possible_colors__} = [ 'red', 'green', 'blue', 'brown', # PROFILE BLOCK START 'orange', 'purple', 'magenta', 'gray', 'plum', 'silver', 'pink', 'lightgreen', 'lightblue', 'lightcyan', 'lightcoral', 'lightsalmon', 'lightgrey', 'darkorange', 'darkcyan', 'feldspar' ]; # PROFILE BLOCK STOP # Precomputed per bucket probabilities $self->{bucket_start__} = {}; # A very unlikely word $self->{not_likely__} = {}; # The expected corpus version # # DEPRECATED This is only used when upgrading old flat file corpus files # to the database $self->{corpus_version__} = 1; # The unclassified cutoff this value means that the top probabilily must be n times greater than the # second probability, default is 100 times more likely $self->{unclassified__} = log(100); # Used to tell the caller whether a magnet was used in the last # mail classification $self->{magnet_used__} = 0; $self->{magnet_detail__} = ''; # This maps session keys (long strings) to user ids. If there's an entry here then the session key # is valid and can be used in the POPFile API. See the methods get_session_key and release_session_key # for details $self->{api_sessions__} = {}; # Must call bless before attempting to call any methods bless $self, $type; $self->name( 'bayes' ); return $self; } # --------------------------------------------------------------------------------------------- # # forked # # This is called inside a child process that has just forked, since the child needs access # to the database we open it # # --------------------------------------------------------------------------------------------- sub forked { my ( $self ) = @_; $self->db_connect__(); } # --------------------------------------------------------------------------------------------- # # initialize # # Called to set up the Bayes module's parameters # # --------------------------------------------------------------------------------------------- sub initialize { my ( $self ) = @_; # This is the name for the database $self->config_( 'database', 'popfile.db' ); # This is the 'connect' string used by DBI to connect to the database, if # you decide to change from using SQLite to some other database (e.g. MySQL, # Oracle, ... ) this *should* be all you need to change. The additional # parameters user and auth are needed for some databases. # # Note that the dbconnect string will be interpolated before being passed # to DBI and the variable $dbname can be used within it and it resolves to # the full path to the database named in the database parameter above. $self->config_( 'dbconnect', 'dbi:SQLite:dbname=$dbname' ); $self->config_( 'dbuser', '' ); $self->config_( 'dbauth', '' ); # No default unclassified weight is the number of times more sure POPFile # must be of the top class vs the second class, default is 100 times more $self->config_( 'unclassified_weight', 100 ); # The corpus is kept in the 'corpus' subfolder of POPFile # # DEPRECATED This is only used to find an old corpus that might need to # be upgraded $self->config_( 'corpus', 'corpus' ); # The characters that appear before and after a subject modification $self->config_( 'subject_mod_left', '[' ); $self->config_( 'subject_mod_right', ']' ); # Get the hostname for use in the X-POPFile-Link header $self->{hostname__} = hostname; # Allow the user to override the hostname $self->config_( 'hostname', $self->{hostname__} ); return 1; } # --------------------------------------------------------------------------------------------- # # start # # Called to start the Bayes module running # # --------------------------------------------------------------------------------------------- sub start { my ( $self ) = @_; # Pass in the current interface language for language specific parsing $self->{parser__}->{lang__} = $self->module_config_( 'html', 'language' ); $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) ); if ( !$self->db_connect__() ) { return 0; } $self->upgrade_predatabase_data__(); return 1; } # --------------------------------------------------------------------------------------------- # # stop # # Called when POPFile is terminating # # --------------------------------------------------------------------------------------------- sub stop { my ( $self ) = @_; $self->db_disconnect__(); } # --------------------------------------------------------------------------------------------- # # classified # # Called to inform the module about a classification event # # There is no return value from this method # # --------------------------------------------------------------------------------------------- sub classified { my ( $self, $session, $class ) = @_; $self->set_bucket_parameter( $session, $class, 'count', # PROFILE BLOCK START $self->get_bucket_parameter( $session, $class, 'count' ) + 1 ); # PROFILE BLOCK STOP } # --------------------------------------------------------------------------------------------- # # get_color # # Retrieves the color for a specific word, color is the most likely bucket # # $session Session key returned by get_session_key # $word Word to get the color of # # --------------------------------------------------------------------------------------------- sub get_color { my ( $self, $session, $word ) = @_; my $max = -10000; my $color = 'black'; for my $bucket ($self->get_buckets( $session )) { my $prob = $self->get_value_( $session, $bucket, $word ); if ( $prob != 0 ) { if ( $prob > $max ) { $max = $prob; $color = $self->get_bucket_parameter( $session, $bucket, 'color' ); } } } return $color; } # --------------------------------------------------------------------------------------------- # # get_not_likely_ # # Returns the probability of a word that doesn't appear # # --------------------------------------------------------------------------------------------- sub get_not_likely_ { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->{not_likely__}{$userid}; } # --------------------------------------------------------------------------------------------- # # get_value_ # # Returns the value for a specific word in a bucket. The word is converted to the log value # of the probability before return to get the raw value just hit the hash directly or call # get_base_value_ # # --------------------------------------------------------------------------------------------- sub get_value_ { my ( $self, $session, $bucket, $word ) = @_; my $value = $self->db_get_word_count__( $session, $bucket, $word ); if ( defined( $value ) && ( $value > 0 ) ) { # Profiling notes: # # I tried caching the log of the total value and then doing # log( $value ) - $cached and this turned out to be # much slower than this single log with a division in it return log( $value / $self->get_bucket_word_count( $session, $bucket ) ); } else { return 0; } } sub get_base_value_ { my ( $self, $session, $bucket, $word ) = @_; my $value = $self->db_get_word_count__( $session, $bucket, $word ); if ( defined( $value ) ) { return $value; } else { return 0; } } # --------------------------------------------------------------------------------------------- # # set_value_ # # Sets the value for a word in a bucket and updates the total word counts for the bucket # and globally # # --------------------------------------------------------------------------------------------- sub set_value_ { my ( $self, $session, $bucket, $word, $value ) = @_; if ( $self->db_put_word_count__( $session, $bucket, $word, $value ) == 1 ) { # If we set the word count to zero then clean it up by deleting the # entry my $userid = $self->valid_session_key__( $session ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; $self->{db_delete_zero_words__}->execute( $bucketid ); return 1; } else { return 0; } } # --------------------------------------------------------------------------------------------- # # get_sort_value_ behaves the same as get_value_, except that it returns not_likely__ rather # than 0 if the word is not found. This makes its result more suitable as a sort key for bucket # ranking. # # --------------------------------------------------------------------------------------------- sub get_sort_value_ { my ( $self, $session, $bucket, $word ) = @_; my $v = $self->get_value_( $session, $bucket, $word ); if ( $v == 0 ) { my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->{not_likely__}{$userid}; } else { return $v; } } # --------------------------------------------------------------------------------------------- # # update_constants__ # # Updates not_likely and bucket_start # # --------------------------------------------------------------------------------------------- sub update_constants__ { my ( $self, $session ) = @_; my $wc = $self->get_word_count( $session ); my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); if ( $wc > 0 ) { $self->{not_likely__}{$userid} = -log( 10 * $wc ); foreach my $bucket ($self->get_buckets( $session )) { my $total = $self->get_bucket_word_count( $session, $bucket ); if ( $total != 0 ) { $self->{bucket_start__}{$userid}{$bucket} = log( $total / $wc ); } else { $self->{bucket_start__}{$userid}{$bucket} = 0; } } } else { $self->{not_likely__}{$userid} = 0; } } # --------------------------------------------------------------------------------------------- # # parse_with_kakasi__ # # Parse Japanese mail message with Kakasi # # Japanese needs to be parsed by language processing filter, "Kakasi" # before it is passed to Bayes classifier because words are not splitted # by spaces. # # $file The file to parse # # --------------------------------------------------------------------------------------------- sub parse_with_kakasi__ { my ( $self, $file, $dcount, $mcount ) = @_; # This is used for Japanese support require Encode; # This is used to parse Japanese require Text::Kakasi; my $temp_file = $self->get_user_path_( $self->global_config_( 'msgdir' ) . "kakasi$dcount" . "=$mcount.msg" ); # Split Japanese email body into words using Kakasi Wakachigaki # mode(-w is passed to Kakasi as argument). The most common charset of # Japanese email is ISO-2022-JP, alias is jis, so -ijis and -ojis # are passed to tell Kakasi the input charset and the output charset # explicitly. # # After Kakasi processing, Encode::from_to is used to convert into UTF-8. # # Japanese email charset is assumed to be ISO-2022-JP. Needs to expand for # other possible charset, such as Shift_JIS, EUC-JP, UTF-8. Text::Kakasi::getopt_argv("kakasi", "-w -ijis -ojis"); open KAKASI_IN, "<$file"; open KAKASI_OUT, ">$temp_file"; while( <KAKASI_IN> ){ my $kakasi_out; $kakasi_out = Text::Kakasi::do_kakasi($_); Encode::from_to($kakasi_out, "iso-2022-jp", "euc-jp"); print KAKASI_OUT $kakasi_out; } close KAKASI_OUT; close KAKASI_IN; Text::Kakasi::close_kanwadict(); unlink( $file ); rename( $temp_file, $file ); } # --------------------------------------------------------------------------------------------- # # db_connect__ # # Connects to the POPFile database and returns 1 if successful # # --------------------------------------------------------------------------------------------- sub db_connect__ { my ( $self ) = @_; # Connect to the database, note that the database must exist for this to work, # to make this easy for people POPFile we will create the database automatically # here using the file 'popfile.sql' which should be located in the same directory # the Classifier/Bayes.pm module my $dbname = $self->get_user_path_( $self->config_( 'database' ) ); my $dbpresent = ( -e $dbname ) || 0; # Now perform the connect, note that this is database independent at this point, the # actual database that we connect to is defined by the dbconnect parameter. my $dbconnect = $self->config_( 'dbconnect' ); $dbconnect =~ s/\$dbname/$dbname/g; $self->log_( "Attempting to connect to $dbconnect ($dbpresent)" ); $self->{db__} = DBI->connect( $dbconnect, # PROFILE BLOCK START $self->config_( 'dbuser' ), $self->config_( 'dbauth' ) ); # PROFILE BLOCK STOP if ( !defined( $self->{db__} ) ) { $self->log_( "Failed to connect to database and got error $DBI::errstr" ); return 0; } if ( !$dbpresent ) { if ( -e $self->get_root_path_( 'Classifier/popfile.sql' ) ) { my $schema = ''; $self->log_( "Creating database schema" ); open SCHEMA, '<' . $self->get_root_path_( 'Classifier/popfile.sql' ); while ( <SCHEMA> ) { next if ( /^--/ ); next if ( !/[a-z;]/ ); s/--.*$//; # If the line begins 'alter' and we are doing SQLite then ignore # the line if ( ( $dbconnect =~ /sqlite/i ) && ( /^alter/i ) ) { next; } $schema .= $_; if ( ( /end;/ ) || ( /\);/ ) ) { $self->{db__}->do( $schema ); $schema = ''; } } close SCHEMA; } else { $self->log_( "Can't find the database schema" ); return 0; } } # Now prepare common SQL statements for use, as a matter of convention the # parameters to each statement always appear in the following order: # # user # bucket # word # parameter $self->{db_get_buckets__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select name, id, pseudo from buckets where buckets.userid = ?;' ); # PROFILE BLOCK STOP $self->{db_get_wordid__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select id from words where words.word = ? limit 1;' ); # PROFILE BLOCK STOP $self->{db_get_userid__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select id from users where name = ? and password = ? limit 1;' ); # PROFILE BLOCK STOP $self->{db_get_word_count__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select matrix.times from matrix where matrix.bucketid = ? and matrix.wordid = ? limit 1;' ); # PROFILE BLOCK STOP $self->{db_put_word_count__} = $self->{db__}->prepare( # PROFILE BLOCK START 'replace into matrix ( bucketid, wordid, times ) values ( ?, ?, ? );' ); # PROFILE BLOCK STOP $self->{db_get_bucket_unique_counts__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select count(matrix.wordid), buckets.name from matrix, buckets where buckets.userid = ? and matrix.bucketid = buckets.id group by buckets.name;' ); # PROFILE BLOCK STOP $self->{db_get_bucket_word_counts__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select sum(matrix.times), buckets.name from matrix, buckets where matrix.bucketid = buckets.id and buckets.userid = ? group by buckets.name;' ); # PROFILE BLOCK STOP $self->{db_get_unique_word_count__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select count(matrix.wordid) from matrix, buckets where matrix.bucketid = buckets.id and buckets.userid = ?;' ); # PROFILE BLOCK STOP $self->{db_get_full_total__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select sum(matrix.times) from matrix, buckets where buckets.userid = ? and matrix.bucketid = buckets.id;' ); # PROFILE BLOCK STOP $self->{db_get_bucket_parameter__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select bucket_params.val from bucket_params where bucket_params.bucketid = ? and bucket_params.btid = ?;' ); # PROFILE BLOCK STOP $self->{db_set_bucket_parameter__} = $self->{db__}->prepare( # PROFILE BLOCK START 'replace into bucket_params ( bucketid, btid, val ) values ( ?, ?, ? );' ); # PROFILE BLOCK STOP $self->{db_get_bucket_parameter_default__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select bucket_template.def from bucket_template where bucket_template.id = ?;' ); # PROFILE BLOCK STOP $self->{db_get_buckets_with_magnets__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select buckets.name from buckets, magnets where buckets.userid = ? and magnets.bucketid = buckets.id group by buckets.name order by buckets.name;' ); # PROFILE BLOCK STOP $self->{db_delete_zero_words__} = $self->{db__}->prepare( # PROFILE BLOCK START 'delete from matrix where matrix.times = 0 and matrix.bucketid = ?;' ); # PROFILE BLOCK STOP # Get the mapping from parameter names to ids into a local hash my $h = $self->{db__}->prepare( "select name, id from bucket_template;" ); $h->execute; while ( my $row = $h->fetchrow_arrayref ) { $self->{db_parameterid__}{$row->[0]} = $row->[1]; } $h->finish; return 1; } # --------------------------------------------------------------------------------------------- # # db_disconnect__ # # Disconnect from the POPFile database # # --------------------------------------------------------------------------------------------- sub db_disconnect__ { my ( $self ) = @_; $self->{db_get_buckets__}->finish; $self->{db_get_wordid__}->finish; $self->{db_get_userid__}->finish; $self->{db_get_word_count__}->finish; $self->{db_put_word_count__}->finish; $self->{db_get_bucket_unique_counts__}->finish; $self->{db_get_bucket_word_counts__}->finish; $self->{db_get_unique_word_count__}->finish; $self->{db_get_full_total__}->finish; $self->{db_get_bucket_parameter__}->finish; $self->{db_set_bucket_parameter__}->finish; $self->{db_get_bucket_parameter_default__}->finish; $self->{db_get_buckets_with_magnets__}->finish; $self->{db_delete_zero_words__}->finish; if ( defined( $self->{db__} ) ) { $self->{db__}->disconnect; undef $self->{db__}; } } # --------------------------------------------------------------------------------------------- # # db_update_cache__ # # Updates our local cache of user and bucket ids. # # $session Must be a valid session # # --------------------------------------------------------------------------------------------- sub db_update_cache__ { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); delete $self->{db_bucketid__}{$userid}; $self->{db_get_buckets__}->execute( $userid ); while ( my $row = $self->{db_get_buckets__}->fetchrow_arrayref ) { $self->{db_bucketid__}{$userid}{$row->[0]}{id} = $row->[1]; $self->{db_bucketid__}{$userid}{$row->[0]}{pseudo} = $row->[2]; $self->{db_bucketcount__}{$userid}{$row->[0]} = 0; } $self->{db_get_bucket_word_counts__}->execute( $userid ); for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) { $self->{db_bucketcount__}{$userid}{$b} = 0; $self->{db_bucketunique__}{$userid}{$b} = 0; } while ( my $row = $self->{db_get_bucket_word_counts__}->fetchrow_arrayref ) { $self->{db_bucketcount__}{$userid}{$row->[1]} = $row->[0]; } $self->{db_get_bucket_unique_counts__}->execute( $userid ); while ( my $row = $self->{db_get_bucket_unique_counts__}->fetchrow_arrayref ) { $self->{db_bucketunique__}{$userid}{$row->[1]} = $row->[0]; } $self->update_constants__( $session ); } # --------------------------------------------------------------------------------------------- # # db_get_word_count__ # # Return the 'count' value for a word in a bucket. If the word is not found in that # bucket then returns undef. # # $session Valid session ID from get_session_key # $bucket bucket word is in # $word word to lookup # # --------------------------------------------------------------------------------------------- sub db_get_word_count__ { my ( $self, $session, $bucket, $word ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{db_get_wordid__}->execute( $word ); my $result = $self->{db_get_wordid__}->fetchrow_arrayref; if ( !defined( $result ) ) { return undef; } my $wordid = $result->[0]; $self->{db_get_word_count__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id}, $wordid ); $result = $self->{db_get_word_count__}->fetchrow_arrayref; if ( defined( $result ) ) { return $result->[0]; } else { return undef; } } # --------------------------------------------------------------------------------------------- # # db_put_word_count__ # # Update 'count' value for a word in a bucket, if the update fails then returns 0 # otherwise is returns 1 # # $session Valid session ID from get_session_key # $bucket bucket word is in # $word word to update # $count new count value # # --------------------------------------------------------------------------------------------- sub db_put_word_count__ { my ( $self, $session, $bucket, $word, $count ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # We need to have two things before we can start, the id of the word in the words # table (if there's none then we need to add the word), the bucket id in the buckets # table (which must exist) $word = $self->{db__}->quote($word); my $result = $self->{db__}->selectrow_arrayref( "select words.id from words where words.word = $word limit 1;"); if ( !defined( $result ) ) { $self->{db__}->do( "insert into words ( word ) values ( $word );" ); $result = $self->{db__}->selectrow_arrayref( "select words.id from words where words.word = $word limit 1;"); } my $wordid = $result->[0]; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; $self->{db_put_word_count__}->execute( $bucketid, $wordid, $count ); return 1; } # --------------------------------------------------------------------------------------------- # # upgrade_predatabase_data__ # # Looks for old POPFile data (in flat files or BerkeleyDB tables) and upgrades it to the # SQL database. Data upgraded is removed. # # --------------------------------------------------------------------------------------------- sub upgrade_predatabase_data__ { my ( $self ) = @_; my $c = 0; # There's an assumption here that this is the single user version of POPFile # and hence what we do is cheat and get a session key assuming that the user # name is admin with password '' my $session = $self->get_session_key( 'admin', '' ); if ( !defined( $session ) ) { $self->log_( "Tried to get the session key for user admin and failed; cannot upgrade old data" ); return; } my @buckets = glob $self->get_user_path_( $self->config_( 'corpus' ) . '/*' ); foreach my $bucket (@buckets) { # A bucket directory must be a directory next unless ( -d $bucket ); next unless ( ( -e "$bucket/table" ) || ( -e "$bucket/table.db" ) ); return 0 if ( !$self->upgrade_bucket__( $session, $bucket ) ); my $color = ''; # See if there's a color file specified if ( open COLOR, '<' . "$bucket/color" ) { $color = <COLOR>; # Someone (who shall remain nameless) went in and manually created # empty color files in their corpus directories which would cause # $color at this point to be undefined and hence you'd get warnings # about undefined variables below. So this little test is to deal # with that user and to make POPFile a little safer which is always # a good thing if ( !defined( $color ) ) { $color = ''; } else { $color =~ s/[\r\n]//g; } close COLOR; unlink "$bucket/color"; } $bucket =~ /([[:alpha:]0-9-_]+)$/; $bucket = $1; $self->set_bucket_color( $session, $bucket, ($color eq '')?$self->{possible_colors__}[$c]:$color ); $c = ($c+1) % ($#{$self->{possible_colors__}}+1); } $self->release_session_key( $session ); return 1; } # --------------------------------------------------------------------------------------------- # # upgrade_bucket__ # # Loads an individual bucket # # $session Valid session key from get_session_key # $bucket The bucket name # # --------------------------------------------------------------------------------------------- sub upgrade_bucket__ { my ( $self, $session, $bucket ) = @_; $bucket =~ /([[:alpha:]0-9-_]+)$/; $bucket = $1; $self->create_bucket( $session, $bucket ); if ( open PARAMS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" ) ) { while ( <PARAMS> ) { s/[\r\n]//g; if ( /^([[:lower:]]+) ([^\r\n\t ]+)$/ ) { $self->set_bucket_parameter( $session, $bucket, $1, $2 ); } } close PARAMS; unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" ); } # Pre v0.21.0 POPFile had GLOBAL parameters for subject modification, # XTC and XPL insertion. To make the upgrade as clean as possible # check these parameters so that if they were OFF we set the equivalent # per bucket to off foreach my $gl ( 'subject', 'xtc', 'xpl' ) { $self->log_( "Checking deprecated parameter GLOBAL_$gl for $bucket\n" ); my $val = $self->{configuration__}->deprecated_parameter( "GLOBAL_$gl" ); if ( defined( $val ) && ( $val == 0 ) ) { $self->log_( "GLOBAL_$gl is 0 for $bucket, overriding $gl\n" ); $self->set_bucket_parameter( $session, $bucket, $gl, 0 ); } } # See if there are magnets defined if ( open MAGNETS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" ) ) { while ( <MAGNETS> ) { s/[\r\n]//g; # Because of a bug in v0.17.9 and earlier of POPFile the text of # some magnets was getting mangled by certain characters having # a \ prepended. Code here removes the \ in these cases to make # an upgrade smooth. if ( /^([^ ]+) (.+)$/ ) { my $type = $1; my $value = $2; # Some people were accidently creating magnets with trailing whitespace # which really confused them later when their magnet did not match (see # comment in UI::HTML::magnet for more detail) $value =~ s/^[ \t]+//g; $value =~ s/[ \t]+$//g; $value =~ s/\$\?|\*|\||\(|$|\[|\]|\{|\}|\^|\$|\.)/$1/g; $self->create_magnet( $session, $bucket, $type, $value ); } else { # This branch is used to catch the original magnets in an # old version of POPFile that were just there for from # addresses only if ( /^(.+)$/ ) { my $value = $1; $value =~ s/\$\?|\*|\||\(|$|\[|\]|\{|\}|\^|\$|\.)/$1/g; $self->create_magnet( $session, $bucket, 'from', $value ); } } } close MAGNETS; unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" ); } # If there is no existing table but there is a table file (the old style # flat file used by POPFile for corpus storage) then create the new # database from it thus performing an automatic upgrade. if ( -e $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) ) { $self->log_( "Performing automatic upgrade of $bucket corpus from flat file to DBI" ); $self->{db__}->begin_work; if ( open WORDS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) ) { my $wc = 1; my $first = <WORDS>; if ( defined( $first ) && ( $first =~ s/^__CORPUS__ __VERSION__ (\d+)// ) ) { if ( $1 != $self->{corpus_version__} ) { print STDERR "Incompatible corpus version in $bucket\n"; close WORDS; $self->{db__}->rollback; return 0; } else { $self->log_( "Upgrading bucket $bucket..." ); while ( <WORDS> ) { if ( $wc % 100 == 0 ) { $self->log_( "$wc" ); } $wc += 1; s/[\r\n]//g; if ( /^([^\s]+) (\d+)$/ ) { if ( $2 != 0 ) { $self->db_put_word_count__( $session, $bucket, $1, $2 ); } } else { $self->log_( "Found entry in corpus for $bucket that looks wrong: \"$_\" (ignoring)" ); } } } if ( $wc > 1 ) { $wc -= 1; $self->log_( "(completed $wc words)" ); } close WORDS; } else { close WORDS; $self->{db__}->rollback; unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ); return 0; } $self->{db__}->commit; unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ); } } # Now check to see if there's a BerkeleyDB-style table my $bdb_file = $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table.db" ); if ( -e $bdb_file ) { $self->log_( "Performing automatic upgrade of $bucket corpus from BerkeleyDB to DBI" ); require BerkeleyDB; my %h; tie %h, "BerkeleyDB::Hash", -Filename => $bdb_file; $self->log_( "Upgrading bucket $bucket..." ); $self->{db__}->begin_work; my $wc = 1; for my $word (keys %h) { if ( $wc % 100 == 0 ) { $self->log_( "$wc" ); } next if ( $word =~ /__POPFILE__(LOG__TOTAL|TOTAL|UNIQUE)__/ ); $wc += 1; if ( $h{$word} != 0 ) { $self->db_put_word_count__( $session, $bucket, $word, $h{$word} ); } } $wc -= 1; $self->log_( "(completed $wc words)" ); $self->{db__}->commit; untie %h; unlink $bdb_file; } return 1; } # --------------------------------------------------------------------------------------------- # # magnet_match_helper__ # # Helper the determines if a specific string matches a certain magnet type in a bucket, used # by magnet_match_ # # $session Valid session from get_session_key # $match The string to match # $bucket The bucket to check # $type The magnet type to check # # --------------------------------------------------------------------------------------------- sub magnet_match_helper__ { my ( $self, $session, $match, $bucket, $type ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $match = lc($match); # In Japanese and Korean mode, disable locale. # Sorting Japanese and Korean with "use locale" is memory and time consuming, # and may cause perl crash. my @magnets; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $h = $self->{db__}->prepare( # PROFILE BLOCK START "select magnets.val from magnets, users, buckets, magnet_types where buckets.id = $bucketid and users.id = buckets.userid and magnets.bucketid = buckets.id and magnet_types.mtype = '$type' and magnets.mtid = magnet_types.id order by magnets.val;" ); # PROFILE BLOCK STOP $h->execute; while ( my $row = $h->fetchrow_arrayref ) { push @magnets, ($row->[0]); } $h->finish; for my $magnet (@magnets) { $magnet = lc($magnet); for my $i (0..(length($match)-length($magnet))) { if ( substr( $match, $i, length($magnet)) eq $magnet ) { $self->{scores__} = ''; $self->{magnet_used__} = 1; $self->{magnet_detail__} = "$type: $magnet"; return 1; } } } return 0; } # --------------------------------------------------------------------------------------------- # # magnet_match__ # # Helper the determines if a specific string matches a certain magnet type in a bucket # # $session Valid session from get_session_key # $match The string to match # $bucket The bucket to check # $type The magnet type to check # # --------------------------------------------------------------------------------------------- sub magnet_match__ { my ( $self, $session, $match, $bucket, $type ) = @_; return $self->magnet_match_helper__( $session, $match, $bucket, $type ); } # --------------------------------------------------------------------------------------------- # # write_line__ # # Writes a line to a file and parses it unless the classification is already known # # $file File handle for file to write line to # $line The line to write # $class (optional) The current classification # # --------------------------------------------------------------------------------------------- sub write_line__ { my ( $self, $file, $line, $class ) = @_; print $file $line if defined( $file ); if ( $class eq '' ) { $self->{parser__}->parse_line( $line ); } } # --------------------------------------------------------------------------------------------- # # add_words_to_bucket__ # # Takes words previously parsed by the mail parser and adds/subtracts them to/from a bucket, # this is a helper used by add_messages_to_bucket, remove_message_from_bucket # # $session Valid session from get_session_key # $bucket Bucket to add to # $subtract Set to -1 means subtract the words, set to 1 means add # # --------------------------------------------------------------------------------------------- sub add_words_to_bucket__ { my ( $self, $session, $bucket, $subtract ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Map the list of words to a list of counts currently in the database # then update those counts and write them back to the database. my $words; if ( $self->module_config_( 'html', 'language' ) =~ /^Nihongo|Korean$/ ) { no locale; $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) ); } else { $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) ); } $self->{get_wordids__} = $self->{db__}->prepare( # PROFILE BLOCK START "select id, word from words where word in ( $words );" ); # PROFILE BLOCK STOP $self->{get_wordids__}->execute; my @id_list; my %wordmap; while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) { push @id_list, ($row->[0]); $wordmap{$row->[1]} = $row->[0]; } $self->{get_wordids__}->finish; my $ids = join( ',', @id_list ); $self->{db_getwords__} = $self->{db__}->prepare( # PROFILE BLOCK START "select matrix.times, matrix.wordid from matrix where matrix.wordid in ( $ids ) and matrix.bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};" ); # PROFILE BLOCK STOP $self->{db_getwords__}->execute; my %counts; while ( my $row = $self->{db_getwords__}->fetchrow_arrayref ) { $counts{$row->[1]} = $row->[0]; } $self->{db_getwords__}->finish; $self->{db__}->begin_work; foreach my $word (keys %{$self->{parser__}->{words__}}) { # If there's already a count then it means that the word is already # in the database and we have its id in $wordmap{$word} so for speed we # execute the db_put_word_count__ query here rather than going through # set_value_ which would need to look up the wordid again if ( defined( $wordmap{$word} ) && defined( $counts{$wordmap{$word}} ) ) { $self->{db_put_word_count__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id}, # PROFILE BLOCK START $wordmap{$word}, $counts{$wordmap{$word}} + $subtract * $self->{parser__}->{words__}{$word} ); # PROFILE BLOCK STOP } else { # If the word is not in the database and we are trying to subtract then # we do nothing because negative values are meaningless if ( $subtract == 1 ) { $self->db_put_word_count__( $session, $bucket, $word, $self->{parser__}->{words__}{$word} ); } } } # If we were doing a subtract operation it's possible that some of the words # in the bucket now have a zero count and should be removed if ( $subtract == -1 ) { $self->{db_delete_zero_words__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id} ); } $self->{db__}->commit; } # --------------------------------------------------------------------------------------------- # # echo_to_dot_ # # $mail The stream (created with IO::) to send the message to (the remote mail server) # $client (optional) The local mail client (created with IO::) that needs the response # $file (optional) A file to print the response to, caller specifies open style # $before (optional) String to send to client before the dot is sent # # echo all information from the $mail server until a single line with a . is seen # # NOTE Also echoes the line with . to $client but not to $file # # Returns 1 if there was a . or 0 if reached EOF before we hit the . # # --------------------------------------------------------------------------------------------- sub echo_to_dot_ { my ( $self, $mail, $client, $file, $before ) = @_; my $hit_dot = 0; my $isopen = open FILE, "$file" if ( defined( $file ) ); binmode FILE if ($isopen); while ( my $line = $self->slurp_( $mail ) ) { # Check for an abort last if ( $self->{alive_} == 0 ); # The termination has to be a single line with exactly a dot on it and nothing # else other than line termination characters. This is vital so that we do # not mistake a line beginning with . as the end of the block if ( $line =~ /^\.(\r\n|\r|\n)$/ ) { $hit_dot = 1; if ( defined( $before ) && ( $before ne '' ) ) { print $client $before if ( defined( $client ) ); print FILE $before if ( defined( $isopen ) ); } # Note that there is no print FILE here. This is correct because we # do no want the network terminator . to appear in the file version # of any message print $client $line if ( defined( $client ) ); last; } print $client $line if ( defined( $client ) ); print FILE $line if ( defined( $isopen ) ); } close FILE if ( $isopen ); return $hit_dot; } # --------------------------------------------------------------------------------------------- # # substr_euc__ # # "substr" function which supports EUC Japanese charset # # $pos Start position # $len Word length # # --------------------------------------------------------------------------------------------- sub substr_euc__ { my ( $str, $pos, $len ) = @_; my $result_str; my $char; my $count = 0; if ( !$pos ) { $pos = 0; } if ( !$len ) { $len = length( $str ); } for ( $pos = 0; $count < $len; $pos++ ) { $char = substr( $str, $pos, 1 ); if ( $char =~ /[\x80-\xff]/ ) { $char = substr( $str, $pos++, 2 ); } $result_str .= $char; $count++; } return $result_str; } # --------------------------------------------------------------------------------------------- # # generate_unique_session_key__ # # Returns a unique string based session key that can be used as a key in the api_sessions__ # # --------------------------------------------------------------------------------------------- sub generate_unique_session_key__ { my ( $self ) = @_; my @chars = ( 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', # PROFILE BLOCK START 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A' ); # PROFILE BLOCK STOP my $session; do { $session = ''; my $length = int( 16 + rand(4) ); for my $i (0 .. $length) { my $random = $chars[int( rand(36) )]; # Just to add spice to things we sometimes lowercase the value if ( rand(1) < rand(1) ) { $random = lc($random); } $session .= $random; } } while ( defined( $self->{api_sessions__}{$session} ) ); return $session; } # --------------------------------------------------------------------------------------------- # # valid_session_key__ # # $session Session key returned by call to get_session_key # # Returns undef is the session key is not valid, or returns the user ID associated # with the session key which can be used in database accesses # # --------------------------------------------------------------------------------------------- sub valid_session_key__ { my ( $self, $session ) = @_; # This provides protection against someone using the XML-RPC interface and calling # this API directly to fish for session keys, this must be called from within this # module return undef if ( caller ne 'Classifier::Bayes' ); # If the session key is invalid then wait 1 second. This is done to prevent # people from calling a POPFile API such as get_bucket_count with random session # keys fishing for a valid key. The XML-RPC API is single threaded and hence this # will delay all use of that API by one second. Of course in normal use when the # user knows the username/password or session key then there is no delay if ( !defined( $self->{api_sessions__}{$session} ) ) { my ( $package, $filename, $line, $subroutine ) = caller; $self->log_( "Invalid session key $session provided in $package @ $line" ); select( undef, undef, undef, 1 ); } return $self->{api_sessions__}{$session}; } # --------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------- # _____ _____ _____ _______ _____ _______ _______ _____ _____ # |_____] | | |_____] |______ | | |______ |_____| |_____] | # | |_____| | | __|__ |_____ |______ | | | __|__ # # The method below are public and may be accessed by other modules. All of them may be # accessed remotely through the XMLRPC.pm module using the XML-RPC protocol # # Note that every API function expects to be passed a $session which is obtained by first # calling get_session_key with a valid username and password. Once done call the method # release_session_key. # # --------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------- # # get_session_key # # $user The name of an existing user # $pwd The user's password # # Returns a string based session key if the username and password match, or undef if not # # --------------------------------------------------------------------------------------------- sub get_session_key { my ( $self, $user, $pwd ) = @_; # The password is stored in the database as an MD5 hash of the username and # password concatenated and separated by the string __popfile__, so compute # the hash here my $hash = md5_hex( $user . '__popfile__' . $pwd ); $self->{db_get_userid__}->execute( $user, $hash ); my $result = $self->{db_get_userid__}->fetchrow_arrayref; if ( !defined( $result ) ) { # The delay of one second here is to prevent people from trying out # username/password combinations at high speed to determine the # credentials of a valid user $self->log_( "Attempt to login with incorrect credentials for user $user" ); select( undef, undef, undef, 1 ); return undef; } my $session = $self->generate_unique_session_key__(); $self->{api_sessions__}{$session} = $result->[0]; $self->db_update_cache__( $session ); $self->log_( "get_session_key returning key $session for user $self->{api_sessions__}{$session}" ); return $session; } # --------------------------------------------------------------------------------------------- # # release_session_key # # $session A session key previously returned by get_session_key # # Releases and invalidates the session key # # --------------------------------------------------------------------------------------------- sub release_session_key { my ( $self, $session ) = @_; if ( defined( $self->{api_sessions__}{$session} ) ) { $self->log_( "release_session_key releasing key $session for user $self->{api_sessions__}{$session}" ); delete $self->{api_sessions__}{$session}; } } # --------------------------------------------------------------------------------------------- # # get_top_bucket__ # # Helper function used by classify to get the bucket with the highest score from data # stored in a matrix of information (see definition of %matrix in classify for details) # and a list of potential buckets # # $userid User ID for database access # $id ID of a word in $matrix # $matrix Reference to the %matrix hash in classify # $buckets Reference to a list of buckets # # Returns the bucket in $buckets with the highest score # # --------------------------------------------------------------------------------------------- sub get_top_bucket__ { my ( $self, $userid, $id, $matrix, $buckets ) = @_; my $best_probability = 0; my $top_bucket = 'unclassified'; for my $bucket (@$buckets) { my $probability = 0; if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) { $probability = $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket}; } if ( $probability > $best_probability ) { $best_probability = $probability; $top_bucket = $bucket; } } return $top_bucket; } # --------------------------------------------------------------------------------------------- # # classify # # $session A valid session key returned by a call to get_session_key # $file The name of the file containing the text to classify (or undef to use # the data already in the parser) # $ui Reference to the UI used when doing colorization # $matrix (optional) Reference to a hash that will be filled with the word matrix # used in classification # $idmap (optional) Reference to a hash that will map word ids in the $matrix to # actual words # # Splits the mail message into valid words, then runs the Bayes algorithm to figure out # which bucket it belongs in. Returns the bucket name # # --------------------------------------------------------------------------------------------- sub classify { my ( $self, $session, $file, $ui, $matrix, $idmap ) = @_; my $msg_total = 0; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) ); # Pass language parameter to parse_file() $self->{magnet_used__} = 0; $self->{magnet_detail__} = ''; if ( defined( $file ) ) { $self->{parser__}->parse_file( $file, # PROFILE BLOCK START $self->module_config_( 'html', 'language' ), $self->global_config_( 'message_cutoff' ) ); # PROFILE BLOCK STOP } # Check to see if this email should be classified based on a magnet # Get the list of buckets my @buckets = $self->get_buckets( $session ); for my $bucket ($self->get_buckets_with_magnets( $session )) { for my $type ($self->get_magnet_types_in_bucket( $session, $bucket )) { if ( $self->magnet_match__( $session, $self->{parser__}->get_header($type), $bucket, $type ) ) { return $bucket; } } } # If the user has not defined any buckets then we escape here return unclassified return "unclassified" if ( $#buckets == -1 ); # The score hash will contain the likelihood that the given message is in each # bucket, the buckets are the keys for score # Set up the initial score as P(bucket) my %score; my %matchcount; # Build up a list of the buckets that are OK to use for classification (i.e. # that have at least one word in them). my @ok_buckets; for my $bucket (@buckets) { if ( $self->{bucket_start__}{$userid}{$bucket} != 0 ) { $score{$bucket} = $self->{bucket_start__}{$userid}{$bucket}; $matchcount{$bucket} = 0; push @ok_buckets, ( $bucket ); } } @buckets = @ok_buckets; # For each word go through the buckets and calculate P(word|bucket) and then calculate # P(word|bucket) ^ word count and multiply to the score my $word_count = 0; # The correction value is used to generate score displays in the scores__ # variable which are consistent with the word scores shown by the GUI's # word lookup feature. It is computed to make the contribution of a word # which is unrepresented in a bucket zero. This correction affects only # the values displayed in scores__; it has no effect on the classification # process. my $correction = 0; # Classification against the database works in a sequence of steps to get # the fastest time possible. The steps are as follows: # # 1. Convert the list of words returned by the parser into a list of unique # word ids that can be used in the database. This requires a select # against the database to get the word ids (and associated words) which # is then converted into two things: @id_list which is just the sorted # list of word ids and %idmap which maps a word to its id. # # 2. Then run a second select that get the triplet (count, id, bucket) for # each word id and each bucket. The triplet contains the word count from # the database for each bucket and each id, where there is an entry. That # data gets loaded into the sparse matrix %matrix. # # 3. Do the normal classification loop as before running against the @id_list # for the words and for each bucket. If there's an entry in %matrix for # the id/bucket combination then calculate the probability, otherwise use # the not_likely probability. # # NOTE. Since there is a single not_likely probability we do not worry about # the fact that the select in 1 might return a shorter list of words # than was found in the message (because some words are not in the # database) since the missing words will be the same for all buckets # and hence constitute a fixed scaling factor on all the buckets which # is irrelevant in deciding which the winning bucket is. my $words; if ( $self->module_config_( 'html', 'language' ) =~ /^Nihongo|Korean$/ ) { no locale; $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) ); } else { $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) ); } $self->{get_wordids__} = $self->{db__}->prepare( # PROFILE BLOCK START "select id, word from words where word in ( $words ) order by id;" ); # PROFILE BLOCK STOP $self->{get_wordids__}->execute; my @id_list; my %temp_idmap; if ( !defined( $idmap ) ) { $idmap = \%temp_idmap; } while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) { push @id_list, ($row->[0]); $$idmap{$row->[0]} = $row->[1]; } $self->{get_wordids__}->finish; my $ids = join( ',', @id_list ); $self->{db_classify__} = $self->{db__}->prepare( # PROFILE BLOCK START "select matrix.times, matrix.wordid, buckets.name from matrix, buckets where matrix.wordid in ( $ids ) and matrix.bucketid = buckets.id and buckets.userid = $userid;" ); # PROFILE BLOCK STOP $self->{db_classify__}->execute; # %matrix maps wordids and bucket names to counts # $matrix{$wordid}{$bucket} == $count my %temp_matrix; if ( !defined( $matrix ) ) { $matrix = \%temp_matrix; } while ( my $row = $self->{db_classify__}->fetchrow_arrayref ) { $$matrix{$row->[1]}{$row->[2]} = $row->[0]; } $self->{db_classify__}->finish; foreach my $id (@id_list) { $word_count += 2; my $wmax = -10000; foreach my $bucket (@buckets) { my $probability = 0; if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) { $probability = log( $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket} ); } $matchcount{$bucket} += $self->{parser__}{words__}{$$idmap{$id}} if ($probability != 0); $probability = $self->{not_likely__}{$userid} if ( $probability == 0 ); $wmax = $probability if ( $wmax < $probability ); $score{$bucket} += ( $probability * $self->{parser__}{words__}{$$idmap{$id}} ); } if ($wmax > $self->{not_likely__}{$userid}) { $correction += $self->{not_likely__}{$userid} * $self->{parser__}{words__}{$$idmap{$id}}; } else { $correction += $wmax * $self->{parser__}{words__}{$$idmap{$id}}; } } # Now sort the scores to find the highest and return that bucket as the classification my @ranking = sort {$score{$b} <=> $score{$a}} keys %score; my %raw_score; my $base_score = $score{$ranking[0]}; my $total = 0; # If the first and second bucket are too close in their probabilities, call the message # unclassified. Also if there are fewer than 2 buckets. my $class = 'unclassified'; if ( @buckets > 1 && $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) { $class = $ranking[0]; } # Compute the total of all the scores to generate the normalized scores and probability # estimate. $total is always 1 after the first loop iteration, so any additional term # less than 2 ** -54 is insignificant, and need not be computed. my $ln2p_54 = -54 * log(2); foreach my $b (@ranking) { $raw_score{$b} = $score{$b}; $score{$b} -= $base_score; $total += exp($score{$b}) if ($score{$b} > $ln2p_54 ); } if ($self->{wordscores__} && defined($ui) ) { my %qm = %{$self->{parser__}->quickmagnets()}; my $mlen = scalar(keys %{$self->{parser__}->quickmagnets()}); my %language = $ui->language(); my $session_key = $ui->session_key(); if ( $mlen >= 0 ) { my @buckets = $self->get_buckets( $session ); my $i = 0; $self->{scores__} .= "<form action=\"/magnets\" method=\"get\">\n"; $self->{scores__} .= "<input type=\"hidden\" name=\"session\" value=\"$session_key\" />"; $self->{scores__} .= "<input type=\"hidden\" name=\"count\" value=\"" . ($mlen + 1) . "\" />"; $self->{scores__} .= "<hr><b>$language{QuickMagnets}</b><p>\n<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Magnet}</th>\n<th>$language{Magnet_Always}</th>\n"; my %types = $self->get_magnet_types( $session ); foreach my $type ( sort keys %types ) { if (defined $qm{$type}) { $i += 1; $self->{scores__} .= "<tr><td scope=\"col\">$type: "; $self->{scores__} .= "<select name=\"text$i\" id=\"\">\n"; foreach my $magnet ( @{$qm{$type}} ) { $self->{scores__} .= "<option value=\"$magnet\">$magnet</option>\n"; } $self->{scores__} .= "</select>\n"; $self->{scores__} .= "</td><td>"; $self->{scores__} .= "<input type=\"hidden\" name=\"type$i\" id=\"magnetsAddType\" value=\"$type\"/>"; $self->{scores__} .= "<select name=\"bucket$i\" id=\"magnetsAddBucket\">\n<option value=\"\"></option>\n"; foreach my $bucket (@buckets) { $self->{scores__} .= "<option value=\"$bucket\">$bucket</option>\n"; } $self->{scores__} .= "</select></td></tr>"; } } $self->{scores__} .= "<tr><td></td><td><input type=\"submit\" class=\"submit\" name=\"create\" value=\"$language{Create}\" /></td></tr></table></form>"; } $self->{scores__} .= "<a name=\"scores\">"; $self->{scores__} .= "<hr><b>$language{Scores}</b><p>\n"; $self->{scores__} .= "<table class=\"top20Words\">\n<tr>\n<th scope=\"col\">$language{Bucket}</th>\n<th> </th>\n"; if ($self->{wmformat__} eq 'score') { $self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\" align=\"center\">$language{Score}</th><th scope=\"col\">$language{Probability}</th></tr>\n"; } else { $self->{scores__} .= "<th scope=\"col\">$language{Count} </th><th scope=\"col\">$language{Probability}</th></tr>\n"; } my $log10 = log(10.0); foreach my $b (@ranking) { my $prob = exp($score{$b})/$total; my $probstr; my $rawstr; # If the computed probability would display as 1, display it as .999999 instead. # We don't want to give the impression that POPFile is ever completely sure of its # classification. if ($prob >= .999999) { $probstr = sprintf("%12.6f", 0.999999); } else { if ($prob >= 0.1 || $prob == 0.0) { $probstr = sprintf("%12.6f", $prob); } else { $probstr = sprintf("%17.6e", $prob); } } my $color = $self->get_bucket_color( $session, $b ); if ($self->{wmformat__} eq 'score') { $rawstr = sprintf("%12.6f", ($raw_score{$b} - $correction)/$log10); $self->{scores__} .= "<tr>\n<td><font color=\"$color\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td align=right>$rawstr </td>\n<td>$probstr</td>\n</tr>\n"; } else { $self->{scores__} .= "<tr>\n<td><font color=\"$color\"><b>$b</b></font></td>\n<td> </td>\n<td align=\"right\">$matchcount{$b} </td>\n<td>$probstr</td>\n</tr>\n"; } } $self->{scores__} .= "</table><hr>"; # We want a link to change the format here. But only the UI knows how to build # that link. So we just insert a comment which can be replaced by the UI. There's # probably a better way. $self->{scores__} .= "<p>"; if ( $self->{wmformat__} ne '' ) { $self->{scores__} .= "<table class=\"top20Words\">\n"; $self->{scores__} .= "<tr>\n<th scope=\"col\">$language{Word}</th><th> </th><th scope=\"col\">$language{Count}</th><th> </th>\n"; foreach my $ix (0..($#buckets > 7? 7: $#buckets)) { my $bucket = $ranking[$ix]; my $bucketcolor = $self->get_bucket_color( $session, $bucket ); $self->{scores__} .= "<th><font color=\"$bucketcolor\">$bucket</font></th><th> </th>"; } $self->{scores__} .= "</tr>"; my %wordprobs; # If the word matrix is supposed to show probabilities, compute them, # saving the results in %wordprobs. if ( $self->{wmformat__} eq 'prob') { foreach my $id (@id_list) { my $sumfreq = 0; my %wval; foreach my $bucket (@ranking) { $wval{$bucket} = $$matrix{$id}{$bucket} || 0; $sumfreq += $wval{$bucket}; } # If $sumfreq is still zero then this word didn't appear in any buckets # so we shouldn't create wordprobs entries for it if ( $sumfreq != 0 ) { foreach my $bucket (@ranking) { $wordprobs{$bucket,$id} = $wval{$bucket} / $sumfreq; } } } } my @ranked_ids; if ($self->{wmformat__} eq 'prob') { @ranked_ids = sort {($wordprobs{$ranking[0],$b}||0) <=> ($wordprobs{$ranking[0],$a}||0)} @id_list; } else { @ranked_ids = sort {($$matrix{$b}{$ranking[0]}||0) <=> ($$matrix{$a}{$ranking[0]}||0)} @id_list; } foreach my $id (@ranked_ids) { my $known = 0; foreach my $bucket (@ranking) { if ( defined( $$matrix{$id}{$bucket} ) ) { $known = 1; last; } } if ( $known == 1 ) { my $wordcolor = $self->get_bucket_color( $session, $self->get_top_bucket__( $userid, $id, $matrix, \@ranking ) ); my $count = $self->{parser__}->{words__}{$$idmap{$id}}; $self->{scores__} .= "<tr>\n<td><font color=\"$wordcolor\">$$idmap{$id}</font></td><td> </td><td>$count</td><td> </td>\n"; my $base_probability = 0; if ( defined($$matrix{$id}{$ranking[0]}) && ( $$matrix{$id}{$ranking[0]} > 0 ) ) { $base_probability = log( $$matrix{$id}{$ranking[0]} / $self->{db_bucketcount__}{$userid}{$ranking[0]} ); } foreach my $ix (0..($#buckets > 7? 7: $#buckets)) { my $bucket = $ranking[$ix]; my $probability = 0; if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) { $probability = log( $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket} ); } my $color = 'black'; if ( $probability >= $base_probability || $base_probability == 0 ) { $color = $self->get_bucket_color( $session, $bucket ); } if ( $probability != 0 ) { my $wordprobstr; if ($self->{wmformat__} eq 'score') { $wordprobstr = sprintf("%12.4f", ($probability - $self->{not_likely__}{$userid})/$log10 ); } else { if ($self->{wmformat__} eq 'prob') { $wordprobstr = sprintf("%12.4f", $wordprobs{$bucket,$id}); } else { $wordprobstr = sprintf("%13.5f", exp($probability) ); } } $self->{scores__} .= "<td><font color=\"$color\">$wordprobstr</font></td>\n<td> </td>\n"; } else { $self->{scores__} .= "<td> </td>\n<td> </td>\n"; } } } $self->{scores__} .= "</tr>"; } $self->{scores__} .= "</table></p>"; } } return $class; } # --------------------------------------------------------------------------------------------- # # history_filename # # Returns a path and filename for a POPFile message based on the session count and message count # # $dcount - the unique download/session count for this message # $mcount - the message count for this message # $ext - the extension for this message (defaults to .msg) # $path - 1 to return the path configuration info, 0 to return just the filename (default 0) # # --------------------------------------------------------------------------------------------- sub history_filename { my ( $self, $dcount, $mcount, $ext, $path) = @_; $path = 0 if (!defined($path)); return ($path?$self->get_user_path_( $self->global_config_( 'msgdir' ) ):'') . "popfile$dcount" . "=$mcount" . (defined $ext?$ext:'.msg'); } # --------------------------------------------------------------------------------------------- # # history_write_class - write the class file for a message. # # $filename The name of the message to write the class for # $reclassified Boolean, true if the message has been reclassified # $bucket the name of the bucket the message is in # $usedtobe the name of the bucket the messages used to be in # $magnet the magnet, if any, used to reclassify the message # # --------------------------------------------------------------------------------------------- sub history_write_class { my ( $self, $filename, $reclassified, $bucket, $usedtobe, $magnet ) = @_; $filename =~ s/msg$/cls/; $filename = $self->get_user_path_( $self->global_config_( 'msgdir' ) . $filename ); open CLASS, ">$filename"; if ( defined( $magnet ) && ( $magnet ne '' ) ) { print CLASS "$bucket MAGNET $magnet\n"; } else { if ( defined( $reclassified ) && ( $reclassified == 1 ) ) { print CLASS "RECLASSIFIED\n"; print CLASS "$bucket\n"; if ( defined( $usedtobe ) && ( $usedtobe ne '' ) ) { print CLASS "$usedtobe\n"; } } else { print CLASS "$bucket\n"; } } close CLASS; } # --------------------------------------------------------------------------------------------- # # history_read_class - load the class file for a message. # # returns: ( reclassified, bucket, usedtobe, magnet ) # values: # reclassified: boolean, true if message has been reclassified # bucket: string, the bucket the message is in presently, unknown class if an error occurs # usedtobe: string, the bucket the message used to be in (null if not reclassified) # magnet: string, the magnet # # $filename The name of the message to load the class for # # --------------------------------------------------------------------------------------------- sub history_read_class { my ( $self, $filename ) = @_; $filename =~ s/msg$/cls/; my $reclassified = 0; my $bucket = 'unknown class'; my $usedtobe; my $magnet = ''; if ( open CLASS, '<' . $self->get_user_path_( $self->global_config_( 'msgdir' ) . $filename ) ) { $bucket = <CLASS>; if ( defined( $bucket ) && ( $bucket =~ /([^ ]+) MAGNET ([^\r\n]+)/ ) ) { $bucket = $1; $magnet = $2; } $reclassified = 0; if ( defined( $bucket ) && ( $bucket =~ /RECLASSIFIED/ ) ) { $bucket = <CLASS>; $usedtobe = <CLASS>; $reclassified = 1; $usedtobe =~ s/[\r\n]//g; } close CLASS; $bucket =~ s/[\r\n]//g if defined( $bucket ); } else { $self->log_( "Error: " . $self->get_user_path_( $self->global_config_( 'msgdir' ) . "$filename: $!" ) ); return ( undef, $bucket, undef, undef ); } $bucket = 'unknown class' if ( !defined( $bucket ) ); return ( $reclassified, $bucket, $usedtobe, $magnet ); } # --------------------------------------------------------------------------------------------- # # classify_and_modify # # This method reads an email terminated by . on a line by itself (or the end of stream) # from a handle and creates an entry in the history, outputting the same email on another # handle with the appropriate header modifications and insertions # # $session A valid session key returned by a call to get_session_key # $mail - an open stream to read the email from # $client - an open stream to write the modified email to # $dcount - the unique download count for this message # $mcount - the message count for this message # $nosave - indicates that the message downloaded should not be saved in the history # $class - if we already know the classification # $echo - 1 to echo to the client, 0 to supress, defaults to 1 # $crlf - The sequence to use at the end of a line in the output, normally # this is left undefined and this method uses $eol (the normal network end # of line), but if this method is being used with real files you may wish # to pass in \n instead # # Returns a classification if it worked and the name of the file where the message # was saved # # IMPORTANT NOTE: $mail and $client should be binmode # # --------------------------------------------------------------------------------------------- sub classify_and_modify { my ( $self, $session, $mail, $client, $dcount, $mcount, $nosave, $class, $echo, $crlf ) = @_; $echo = 1 unless (defined $echo); $crlf = $eol unless (defined $crlf); my $msg_subject; # The message subject my $msg_head_before = ''; # Store the message headers that come before Subject here my $msg_head_after = ''; # Store the message headers that come after Subject here my $msg_head_q = ''; # Store questionable header lines here my $msg_body = ''; # Store the message body here # These two variables are used to control the insertion of the X-POPFile-TimeoutPrevention # header when downloading long or slow emails my $last_timeout = time; my $timeout_count = 0; # Indicates whether the first time through the receive loop we got the full body, this # will happen on small emails my $got_full_body = 0; # The size of the message downloaded so far. my $message_size = 0; # The classification for this message my $classification = ''; # Whether we are currently reading the mail headers or not my $getting_headers = 1; my $msg_file = $self->history_filename($dcount,$mcount, ".msg",1); my $temp_file = "$msg_file.tmp"; my $nopath_temp_file = $self->history_filename($dcount,$mcount,".msg",0); # Get the class-file info without the path, since we'd just need to strip it my $class_file = $self->history_filename($dcount,$mcount, ".cls",0); # If we don't yet know the classification then start the parser if ( $class eq '' ) { $self->{parser__}->start_parse(); } # We append .TMP to the filename for the MSG file so that if we are in # middle of downloading a message and we refresh the history we do not # get class file errors open TEMP, ">$temp_file" unless $nosave; while ( my $line = $self->slurp_( $mail ) ) { my $fileline; # This is done so that we remove the network style end of line CR LF # and allow Perl to decide on the local system EOL which it will expand # out of \n when this gets written to the temp file $fileline = $line; $fileline =~ s/[\r\n]//g; $fileline .= "\n"; # Check for an abort last if ( $self->{alive_} == 0 ); # The termination of a message is a line consisting of exactly .CRLF so we detect that # here exactly if ( $line =~ /^\.(\r\n|\r|\n)$/ ) { $got_full_body = 1; last; } if ( $getting_headers ) { # Kill header lines containing only whitespace (Exim does this) next if ( $line =~ /^[ \t]+(\r\n|\r|\n)$/i ); if ( !( $line =~ /^(\r\n|\r|\n)$/i ) ) { $message_size += length $line; $self->write_line__( $nosave?undef:\*TEMP, $fileline, $class ); # If there is no echoing occuring, it doesn't matter what we do to these if ( $echo ) { if ( $line =~ /^Subject:(.*)/i ) { $msg_subject = $1; $msg_subject =~ s/(\012|\015)//g; next; } # Strip out the X-Text-Classification header that is in an incoming message next if ( $line =~ /^X-Text-Classification:/i ); next if ( $line =~ /^X-POPFile-Link:/i ); # Store any lines that appear as though they may be non-header content # Lines that are headers begin with whitespace or Alphanumerics and "-" # followed by a colon. # # This prevents weird things like HTML before the headers terminate from # causing the XPL and XTC headers to be inserted in places some clients # can't detect if ( $line =~ /^([ \t]|([A-Z\-_]+:))/i ) { if ( !defined($msg_subject) ) { $msg_head_before .= $msg_head_q . $line; } else { $msg_head_after .= $msg_head_q . $line; } $msg_head_q = ''; } else { # Gather up any header lines that are questionable $self->log_( "Found odd email header: $line" ); $msg_head_q .= $line; } } } else { $self->write_line__( $nosave?undef:\*TEMP, "\n", $class ); $message_size += length $crlf; $getting_headers = 0; } } else { $message_size += length $line; $msg_body .= $line; $self->write_line__( $nosave?undef:\*TEMP, $fileline, $class ); } # Check to see if too much time has passed and we need to keep the mail client happy if ( time > ( $last_timeout + 2 ) ) { print $client "X-POPFile-TimeoutPrevention: $timeout_count$crlf" if ( $echo ); $timeout_count += 1; $last_timeout = time; } last if ( ( $message_size > $self->global_config_( 'message_cutoff' ) ) && ( $getting_headers == 0 ) ); } close TEMP unless $nosave; # If we don't yet know the classification then stop the parser if ( $class eq '' ) { $self->{parser__}->stop_parse(); } # Do the text classification and update the counter for that bucket that we just downloaded # an email of that type if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) { # Parse Japanese mail message with Kakasi $self->parse_with_kakasi__( $temp_file, $dcount, $mcount ); $classification = ($class ne '')?$class:$self->classify( $session, $temp_file); } else { $classification = ($class ne '')?$class:$self->classify( $session, undef); } my $subject_modification = $self->get_bucket_parameter( $session, $classification, 'subject' ); my $xtc_insertion = $self->get_bucket_parameter( $session, $classification, 'xtc' ); my $xpl_insertion = $self->get_bucket_parameter( $session, $classification, 'xpl' ); my $quarantine = $self->get_bucket_parameter( $session, $classification, 'quarantine' ); my $modification = $self->config_( 'subject_mod_left' ) . $classification . $self->config_( 'subject_mod_right' ); # Add the Subject line modification or the original line back again # Don't add the classification unless it is not present if ( ( defined( $msg_subject ) && ( $msg_subject !~ /\Q$modification\E/ ) ) && # PROFILE BLOCK START ( $subject_modification == 1 ) && ( $quarantine == 0 ) ) { # PROFILE BLOCK STOP $msg_subject = " $modification$msg_subject"; } if ( !defined( $msg_subject ) && # PROFILE BLOCK START ( $subject_modification == 1 ) && ( $quarantine == 0 ) ) { # PROFILE BLOCK STOP $msg_subject = " $modification"; } $msg_subject = '' if ( !defined( $msg_subject ) ); $msg_head_before .= 'Subject:' . $msg_subject; $msg_head_before .= $crlf; # Add the XTC header $msg_head_after .= "X-Text-Classification: $classification$crlf" if ( ( $xtc_insertion ) && # PROFILE BLOCK START ( $quarantine == 0 ) ); # PROFILE BLOCK STOP # Add the XPL header my $xpl = ''; $xpl .= "http://"; $xpl .= $self->module_config_( 'html', 'local' )?"127.0.0.1":$self->config_( 'hostname' ); $xpl .= ":" . $self->module_config_( 'html', 'port' ) . "/jump_to_message?view=$nopath_temp_file$crlf"; if ( $xpl_insertion && ( $quarantine == 0 ) ) { $msg_head_after .= 'X-POPFile-Link: ' . $xpl; } $msg_head_after .= $msg_head_q . "$crlf"; # Echo the text of the message to the client if ( $echo ) { # If the bucket is quarantined then we'll treat it specially by changing the message header to contain # information from POPFile and wrapping the original message in a MIME encoding if ( $quarantine == 1 ) { print $client "From: " . $self->{parser__}->get_header( 'from' ) . "$crlf"; print $client "To: " . $self->{parser__}->get_header( 'to' ) . "$crlf"; print $client "Date: " . $self->{parser__}->get_header( 'date' ) . "$crlf"; # Don't add the classification unless it is not present if ( ( defined( $msg_subject ) && ( $msg_subject !~ /\[\Q$classification\E\]/ ) ) && # PROFILE BLOCK START ( $subject_modification == 1 ) ) { # PROFILE BLOCK STOP $msg_subject = " $modification$msg_subject"; } print $client "Subject:$msg_subject$crlf"; print $client "X-Text-Classification: $classification$crlf" if ( $xtc_insertion ); print $client 'X-POPFile-Link: ' . $xpl if ( $xpl_insertion ); print $client "MIME-Version: 1.0$crlf"; print $client "Content-Type: multipart/report; boundary=\"$nopath_temp_file\"$crlf$crlf--$nopath_temp_file$crlf"; print $client "Content-Type: text/plain$crlf$crlf"; print $client "POPFile has quarantined a message. It is attached to this email.$crlf$crlf"; print $client "Quarantined Message Detail$crlf$crlf"; print $client "Original From: " . $self->{parser__}->get_header('from') . "$crlf"; print $client "Original To: " . $self->{parser__}->get_header('to') . "$crlf"; print $client "Original Subject: " . $self->{parser__}->get_header('subject') . "$crlf"; print $client "To examine the email open the attachment. "; print $client "To change this mail's classification go to $xpl"; print $client "$crlf"; print $client "The first 20 words found in the email are:$crlf$crlf"; print $client $self->{parser__}->first20(); print $client "$crlf--$nopath_temp_file$crlf"; print $client "Content-Type: message/rfc822$crlf$crlf"; } print $client $msg_head_before; print $client $msg_head_after; print $client $msg_body; } my $before_dot = ''; if ( $quarantine && $echo ) { $before_dot = "$crlf--$nopath_temp_file--$crlf"; } my $need_dot = 0; if ( $got_full_body ) { $need_dot = 1; } else { $need_dot = !$self->echo_to_dot_( $mail, $echo?$client:undef, $nosave?undef:'>>' . $temp_file, $before_dot ) && !$nosave; } if ( $need_dot ) { print $client $before_dot if ( $before_dot ne '' ); print $client ".$crlf" if ( $echo ); } # In some cases it's possible (and totally illegal) to get a . in the middle of the message, # to cope with the we call flush_extra_ here to remove any extra stuff the POP3 server is sending # Make sure to supress output if we are not echoing, and to save to file if not echoing and saving if ( !($nosave || $echo) ) { # if we're saving (not nosave) and not echoing, we can safely unload this into the temp file if (open FLUSH, ">$temp_file.flush") { binmode FLUSH; # TODO: Do this in a faster way (without flushing to one file then copying to another) # (perhaps a select on $mail to predict if there is flushable data) $self->flush_extra_( $mail, \*FLUSH, 0); close FLUSH; # append any data we got to the actual temp file if ( ( (-s "$temp_file.flush") > 0 ) && ( open FLUSH, "<$temp_file.flush" ) ) { binmode FLUSH; if ( open TEMP, ">>$temp_file" ) { binmode TEMP; # The only time we get data here is if it is after a CRLF.CRLF # We have to re-create it to avoid data-loss print TEMP ".$crlf"; print TEMP $_ while (<FLUSH>); # NOTE: The last line flushed MAY be a CRLF.CRLF, which isn't actually part of the message body close TEMP; } close FLUSH; } unlink("$temp_file.flush"); } } else { # if we are echoing, the client can make sure we have no data loss # otherwise, the data can be discarded (not saved and not echoed) $self->flush_extra_( $mail, $client, $echo?0:1); } if ( !$nosave ) { $self->history_write_class($class_file, undef, $classification, undef, ($self->{magnet_used__}?$self->{magnet_detail__}:undef)); # Now rename the MSG file, since the class file has been written it's safe for the mesg # file to have the correct name. If the history cache is reloaded then we wont have a class # file error since it was already written unlink $msg_file; rename $temp_file, $msg_file; } return ( $classification, $nopath_temp_file ); } # --------------------------------------------------------------------------------------------- # # get_buckets # # Returns a list containing all the real bucket names sorted into alphabetic order # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub get_buckets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Note that get_buckets does not return pseudo buckets my @buckets; for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) { if ( $self->{db_bucketid__}{$userid}{$b}{pseudo} == 0 ) { push @buckets, ($b); } } return @buckets; } # --------------------------------------------------------------------------------------------- # # get_pseudo_buckets # # Returns a list containing all the pseudo bucket names sorted into alphabetic order # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub get_pseudo_buckets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @buckets; for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) { if ( $self->{db_bucketid__}{$userid}{$b}{pseudo} == 1 ) { push @buckets, ($b); } } return @buckets; } # --------------------------------------------------------------------------------------------- # # get_all_buckets # # Returns a list containing all the bucket names sorted into alphabetic order # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub get_all_buckets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @buckets; for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) { push @buckets, ($b); } return @buckets; } # --------------------------------------------------------------------------------------------- # # is_pseudo_bucket # # Returns 1 if the named bucket is pseudo # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to check # # --------------------------------------------------------------------------------------------- sub is_pseudo_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return ( defined($self->{db_bucketid__}{$userid}{$bucket}) # PROFILE BLOCK START && $self->{db_bucketid__}{$userid}{$bucket}{pseudo} ); # PROFILE BLOCK STOP } # --------------------------------------------------------------------------------------------- # # is_bucket # # Returns 1 if the named bucket is a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to check # # --------------------------------------------------------------------------------------------- sub is_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return ( ( defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) && # PROFILE BLOCK START ( !$self->{db_bucketid__}{$userid}{$bucket}{pseudo} ) ); # PROFILE BLOCK STOP } # --------------------------------------------------------------------------------------------- # # get_bucket_word_count # # Returns the total word count (including duplicates) for the passed in bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the word count is desired # # --------------------------------------------------------------------------------------------- sub get_bucket_word_count { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $c = $self->{db_bucketcount__}{$userid}{$bucket}; return defined($c)?$c:0; } # --------------------------------------------------------------------------------------------- # # get_bucket_word_list # # Returns a list of words all with the same first character # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the word count is desired # $prefix The first character of the words # # --------------------------------------------------------------------------------------------- sub get_bucket_word_list { my ( $self, $session, $bucket, $prefix ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $result = $self->{db__}->selectcol_arrayref( # PROFILE BLOCK START "select words.word from matrix, words where matrix.wordid = words.id and matrix.bucketid = $bucketid and words.word like '$prefix%';"); # PROFILE BLOCK STOP return @{$result}; } # --------------------------------------------------------------------------------------------- # # get_bucket_word_prefixes # # Returns a list of all the initial letters of words in a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the word count is desired # # --------------------------------------------------------------------------------------------- sub get_bucket_word_prefixes { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $prev = ''; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $result = $self->{db__}->selectcol_arrayref( # PROFILE BLOCK START "select words.word from matrix, words where matrix.wordid = words.id and matrix.bucketid = $bucketid;"); # PROFILE BLOCK STOP # In Japanese mode, disable locale and use substr_euc, the substr function # which supports EUC Japanese charset. # Sorting Japanese with "use locale" is memory and time consuming, # and may cause perl crash. if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) { no locale; return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc__($_,0,1)} @{$result}; } else { if ( $self->module_config_( 'html', 'language' ) eq 'Korean' ) { no locale; return grep {$_ ne $prev && ($prev = $_, 1)} sort map {$_ =~ /([\x20-\x80]|$eksc)/} @{$result}; } else { return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} @{$result}; } } } # --------------------------------------------------------------------------------------------- # # get_word_count # # Returns the total word count (including duplicates) # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub get_word_count { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{db_get_full_total__}->execute( $userid ); return $self->{db_get_full_total__}->fetchrow_arrayref->[0]; } # --------------------------------------------------------------------------------------------- # # get_count_for_word # # Returns the number of times the word occurs in a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket we are asking about # $word The word we are asking about # # --------------------------------------------------------------------------------------------- sub get_count_for_word { my ( $self, $session, $bucket, $word ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->get_base_value_( $session, $bucket, $word ); } # --------------------------------------------------------------------------------------------- # # get_bucket_unique_count # # Returns the unique word count (excluding duplicates) for the passed in bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the word count is desired # # --------------------------------------------------------------------------------------------- sub get_bucket_unique_count { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $c = $self->{db_bucketunique__}{$userid}{$bucket}; return defined($c)?$c:0; } # --------------------------------------------------------------------------------------------- # # get_unique_word_count # # Returns the unique word count (excluding duplicates) for all buckets # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub get_unique_word_count { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{db_get_unique_word_count__}->execute( $userid ); return $self->{db_get_unique_word_count__}->fetchrow_arrayref->[0]; } # --------------------------------------------------------------------------------------------- # # get_bucket_color # # Returns the color associated with a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the color is requested # # NOTE This API is DEPRECATED in favor of calling get_bucket_parameter for # the parameter named 'color' # --------------------------------------------------------------------------------------------- sub get_bucket_color { my ( $self, $session, $bucket ) = @_; return $self->get_bucket_parameter( $session, $bucket, 'color' ); } # --------------------------------------------------------------------------------------------- # # set_bucket_color # # Returns the color associated with a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the color is requested # $color The new color # # NOTE This API is DEPRECATED in favor of calling set_bucket_parameter for # the parameter named 'color' # --------------------------------------------------------------------------------------------- sub set_bucket_color { my ( $self, $session, $bucket, $color ) = @_; return $self->set_bucket_parameter( $session, $bucket, 'color', $color ); } # --------------------------------------------------------------------------------------------- # # get_bucket_parameter # # Returns the value of a per bucket parameter # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket # $parameter The name of the parameter # # --------------------------------------------------------------------------------------------- sub get_bucket_parameter { my ( $self, $session, $bucket, $parameter ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # See if there's a cached value if ( defined( $self->{db_parameters__}{$userid}{$bucket}{$parameter} ) ) { return $self->{db_parameters__}{$userid}{$bucket}{$parameter}; } # Make sure that the bucket passed in actually exists if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) { return undef; } # If there is a non-default value for this parameter then return it. $self->{db_get_bucket_parameter__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id}, $self->{db_parameterid__}{$parameter} ); my $result = $self->{db_get_bucket_parameter__}->fetchrow_arrayref; # If this parameter has not been defined for this specific bucket then # get the default value if ( !defined( $result ) ) { $self->{db_get_bucket_parameter_default__}->execute( # PROFILE BLOCK START $self->{db_parameterid__}{$parameter} ); # PROFILE BLOCK STOP $result = $self->{db_get_bucket_parameter_default__}->fetchrow_arrayref; } if ( defined( $result ) ) { $self->{db_parameters__}{$userid}{$bucket}{$parameter} = $result->[0]; return $result->[0]; } else { return undef; } } # --------------------------------------------------------------------------------------------- # # set_bucket_parameter # # Sets the value associated with a bucket specific parameter # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket # $parameter The name of the parameter # $value The new value # # --------------------------------------------------------------------------------------------- sub set_bucket_parameter { my ( $self, $session, $bucket, $parameter, $value ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Make sure that the bucket passed in actually exists if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) { return undef; } my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $btid = $self->{db_parameterid__}{$parameter}; # Exactly one row should be affected by this statement $self->{db_set_bucket_parameter__}->execute( $bucketid, $btid, $value ); if ( defined( $self->{db_parameters__}{$userid}{$bucket}{$parameter} ) ) { $self->{db_parameters__}{$userid}{$bucket}{$parameter} = $value; } return 1; } # --------------------------------------------------------------------------------------------- # # get_html_colored_message # # Parser a mail message stored in a file and returns HTML representing the message # with coloring of the words # # $session A valid session key returned by a call to get_session_key # $file The file to parse # # --------------------------------------------------------------------------------------------- sub get_html_colored_message { my ( $self, $session, $file ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{parser__}->{color__} = $session; $self->{parser__}->{color_matrix__} = undef; $self->{parser__}->{color_idmap__} = undef; $self->{parser__}->{color_userid__} = undef; $self->{parser__}->{bayes__} = bless $self; # Pass language parameter to parse_file() my $result = $self->{parser__}->parse_file( $file, # PROFILE BLOCK START $self->module_config_( 'html', 'language' ), $self->global_config_( 'message_cutoff' ) ); # PROFILE BLOCK STOP $self->{parser__}->{color__} = ''; return $result; } # --------------------------------------------------------------------------------------------- # # fast_get_html_colored_message # # Parser a mail message stored in a file and returns HTML representing the message # with coloring of the words # # $session A valid session key returned by a call to get_session_key # $file The file to colorize # $matrix Reference to the matrix hash from a call to classify # $idmap Reference to the idmap hash from a call to classify # # --------------------------------------------------------------------------------------------- sub fast_get_html_colored_message { my ( $self, $session, $file, $matrix, $idmap ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{parser__}->{color__} = $session; $self->{parser__}->{color_matrix__} = $matrix; $self->{parser__}->{color_idmap__} = $idmap; $self->{parser__}->{color_userid__} = $userid; $self->{parser__}->{bayes__} = bless $self; # Pass language parameter to parse_file() my $result = $self->{parser__}->parse_file( $file, $self->module_config_( 'html', 'language' ), $self->global_config_( 'message_cutoff' ) ); $self->{parser__}->{color__} = ''; return $result; } # --------------------------------------------------------------------------------------------- # # create_bucket # # Creates a new bucket, returns 1 if the creation succeeded # # $session A valid session key returned by a call to get_session_key # $bucket Name for the new bucket # # --------------------------------------------------------------------------------------------- sub create_bucket { my ( $self, $session, $bucket ) = @_; if ( $self->is_bucket( $session, $bucket ) || # PROFILE BLOCK START $self->is_pseudo_bucket( $session, $bucket ) ) { # PROFILE BLOCK STOP return 0; } my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $bucket = $self->{db__}->quote( $bucket ); $self->{db__}->do( # PROFILE BLOCK START "insert into buckets ( name, pseudo, userid ) values ( $bucket, 0, $userid );" ); # PROFILE BLOCK STOP $self->db_update_cache__( $session ); return 1; } # --------------------------------------------------------------------------------------------- # # delete_bucket # # Deletes a bucket, returns 1 if the delete succeeded # # $session A valid session key returned by a call to get_session_key # $bucket Name of the bucket to delete # # --------------------------------------------------------------------------------------------- sub delete_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Make sure that the bucket passed in actually exists if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) { return 0; } $self->{db__}->do( # PROFILE BLOCK START "delete from buckets where buckets.userid = $userid and buckets.name = '$bucket';" ); # PROFILE BLOCK STOP $self->db_update_cache__( $session ); return 1; } # --------------------------------------------------------------------------------------------- # # rename_bucket # # Renames a bucket, returns 1 if the rename succeeded # # $session A valid session key returned by a call to get_session_key # $old_bucket The old name of the bucket # $new_bucket The new name of the bucket # # --------------------------------------------------------------------------------------------- sub rename_bucket { my ( $self, $session, $old_bucket, $new_bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Make sure that the bucket passed in actually exists if ( !defined( $self->{db_bucketid__}{$userid}{$old_bucket} ) ) { $self->log_( "Bad bucket name $old_bucket to rename_bucket" ); return 0; } my $id = $self->{db__}->quote( $self->{db_bucketid__}{$userid}{$old_bucket}{id} ); $new_bucket = $self->{db__}->quote( $new_bucket ); $self->log_( "Rename bucket $old_bucket to $new_bucket" ); my $result = $self->{db__}->do( "update buckets set name = $new_bucket where id = $id;" ); if ( !defined( $result ) || ( $result == -1 ) ) { return 0; } else { $self->db_update_cache__( $session ); return 1; } } # --------------------------------------------------------------------------------------------- # # add_messages_to_bucket # # Parses mail messages and updates the statistics in the specified bucket # # $session A valid session key returned by a call to get_session_key # $bucket Name of the bucket to be updated # @files List of file names to parse # # --------------------------------------------------------------------------------------------- sub add_messages_to_bucket { my ( $self, $session, $bucket, @files ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); if ( !defined( $self->{db_bucketid__}{$userid}{$bucket}{id} ) ) { return 0; } # Pass language parameter to parse_file() # This is done to clear out the word list because in the loop # below we are going to not reset the word list on each parse $self->{parser__}->start_parse(); $self->{parser__}->stop_parse(); foreach my $file (@files) { $self->{parser__}->parse_file( $file, # PROFILE BLOCK START $self->module_config_( 'html', 'language' ), $self->global_config_( 'message_cutoff' ), 0 ); # PROFILE BLOCK STOP (Do not reset word list) } $self->add_words_to_bucket__( $session, $bucket, 1 ); $self->db_update_cache__( $session ); return 1; } # --------------------------------------------------------------------------------------------- # # add_message_to_bucket # # Parses a mail message and updates the statistics in the specified bucket # # $session A valid session key returned by a call to get_session_key # $bucket Name of the bucket to be updated # $file Name of file containing mail message to parse # # --------------------------------------------------------------------------------------------- sub add_message_to_bucket { my ( $self, $session, $bucket, $file ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->add_messages_to_bucket( $session, $bucket, $file ); } # --------------------------------------------------------------------------------------------- # # remove_message_from_bucket # # Parses a mail message and updates the statistics in the specified bucket # # $session A valid session key returned by a call to get_session_key # $bucket Name of the bucket to be updated # $file Name of file containing mail message to parse # # --------------------------------------------------------------------------------------------- sub remove_message_from_bucket { my ( $self, $session, $bucket, $file ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Pass language parameter to parse_file() $self->{parser__}->parse_file( $file, # PROFILE BLOCK START $self->module_config_( 'html', 'language' ), $self->global_config_( 'message_cutoff' ) ); # PROFILE BLOCK STOP $self->add_words_to_bucket__( $session, $bucket, -1 ); $self->db_update_cache__( $session ); return 1; } # --------------------------------------------------------------------------------------------- # # get_buckets_with_magnets # # Returns the names of the buckets for which magnets are defined # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub get_buckets_with_magnets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @result; $self->{db_get_buckets_with_magnets__}->execute( $userid ); while ( my $row = $self->{db_get_buckets_with_magnets__}->fetchrow_arrayref ) { push @result, ($row->[0]); } return @result; } # --------------------------------------------------------------------------------------------- # # get_magnet_types_in_bucket # # Returns the types of the magnets in a specific bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to search for magnets # # --------------------------------------------------------------------------------------------- sub get_magnet_types_in_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @result; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $h = $self->{db__}->prepare( "select magnet_types.mtype from magnet_types, magnets, buckets where magnet_types.id = magnets.mtid and magnets.bucketid = buckets.id and buckets.id = $bucketid group by magnet_types.mtype order by magnet_types.mtype;" ); $h->execute; while ( my $row = $h->fetchrow_arrayref ) { push @result, ($row->[0]); } $h->finish; return @result; } # --------------------------------------------------------------------------------------------- # # clear_bucket # # Removes all words from a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to clear # # --------------------------------------------------------------------------------------------- sub clear_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; $self->{db__}->do( "delete from matrix where matrix.bucketid = $bucketid;" ); $self->db_update_cache__( $session ); } # --------------------------------------------------------------------------------------------- # # clear_magnets # # Removes every magnet currently defined # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub clear_magnets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); for my $bucket (keys %{$self->{db_bucketid__}{$userid}}) { my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; $self->{db__}->do( "delete from magnets where magnets.bucketid = $bucketid" ); } } # --------------------------------------------------------------------------------------------- # # get_magnets # # Returns the magnets of a certain type in a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to search for magnets # $type The magnet type (e.g. from, to or subject) # # --------------------------------------------------------------------------------------------- sub get_magnets { my ( $self, $session, $bucket, $type ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @result; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $h = $self->{db__}->prepare( "select magnets.val from magnets, magnet_types where magnets.bucketid = $bucketid and magnet_types.id = magnets.mtid and magnet_types.mtype = '$type' order by magnets.val;" ); $h->execute; while ( my $row = $h->fetchrow_arrayref ) { push @result, ($row->[0]); } $h->finish; return @result; } # --------------------------------------------------------------------------------------------- # # create_magnet # # Make a new magnet # # $session A valid session key returned by a call to get_session_key # $bucket The bucket the magnet belongs in # $type The magnet type (e.g. from, to or subject) # $text The text of the magnet # # --------------------------------------------------------------------------------------------- sub create_magnet { my ( $self, $session, $bucket, $type, $text ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $result = $self->{db__}->selectrow_arrayref("select magnet_types.id from magnet_types where magnet_types.mtype = '$type';" ); my $mtid = $result->[0]; $self->{db__}->do( "insert into magnets ( bucketid, mtid, val ) values ( $bucketid, $mtid, '$text' );" ); } # --------------------------------------------------------------------------------------------- # # get_magnet_types # # Get a hash mapping magnet types (e.g. from) to magnet names (e.g. From); # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub get_magnet_types { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my %result; my $h = $self->{db__}->prepare( "select magnet_types.mtype, magnet_types.header from magnet_types order by mtype;" ); $h->execute; while ( my $row = $h->fetchrow_arrayref ) { $result{$row->[0]} = $row->[1]; } $h->finish; return %result; } # --------------------------------------------------------------------------------------------- # # delete_magnet # # Remove a new magnet # # $session A valid session key returned by a call to get_session_key # $bucket The bucket the magnet belongs in # $type The magnet type (e.g. from, to or subject) # $text The text of the magnet # # --------------------------------------------------------------------------------------------- sub delete_magnet { my ( $self, $session, $bucket, $type, $text ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $result = $self->{db__}->selectrow_arrayref("select magnet_types.id from magnet_types where magnet_types.mtype = '$type';" ); my $mtid = $result->[0]; $self->{db__}->do( "delete from magnets where magnets.bucketid = $bucketid and magnets.mtid = $mtid and magnets.val = '$text';" ); } # --------------------------------------------------------------------------------------------- # # get_stopword_list # # Gets the complete list of stop words # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub get_stopword_list { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->{parser__}->{mangle__}->stopwords(); } # --------------------------------------------------------------------------------------------- # # magnet_count # # Gets the number of magnets that are defined # # $session A valid session key returned by a call to get_session_key # # --------------------------------------------------------------------------------------------- sub magnet_count { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $result = $self->{db__}->selectrow_arrayref( "select count(*) from magnets, buckets where buckets.userid = $userid and magnets.bucketid = buckets.id;" ); if ( defined( $result ) ) { return $result->[0]; } else { return 0; } } # --------------------------------------------------------------------------------------------- # # add_stopword, remove_stopword # # Adds or removes a stop word # # $session A valid session key returned by a call to get_session_key # $stopword The word to add or remove # # Return 0 for a bad stop word, and 1 otherwise # # --------------------------------------------------------------------------------------------- sub add_stopword { my ( $self, $session, $stopword ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Pass language parameter to add_stopword() return $self->{parser__}->{mangle__}->add_stopword( $stopword, $self->module_config_( 'html', 'language' ) ); } sub remove_stopword { my ( $self, $session, $stopword ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Pass language parameter to remove_stopword() return $self->{parser__}->{mangle__}->remove_stopword( $stopword, $self->module_config_( 'html', 'language' ) ); } # --------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------- # _____ _____ _____ _______ _____ _______ _______ _____ _____ # |_____] | | |_____] |______ | | |______ |_____| |_____] | # | |_____| | | __|__ |_____ |______ | | | __|__ # # --------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------- # GETTERS/SETTERS sub wordscores { my ( $self, $value ) = @_; $self->{wordscores__} = $value if (defined $value); return $self->{wordscores__}; } sub scores { my ( $self, $value ) = @_; $self->{scores__} = $value if (defined $value); return $self->{scores__}; } sub wmformat { my ( $self, $value ) = @_; $self->{wmformat__} = $value if (defined $value); return $self->{wmformat__}; } 1;